Instruction¶

I have a typical project of predicting the NYC uber/lyft trip demand. The dataset is available from Jan2022 to March 2023. The area is already divided into different locations. and I want the predicted demand for each location every 15 mins

Problem statment¶

The goal of this project is to predict the demand for Uber/Lyft trips in different locations of NYC every 15 minutes, using a dataset spanning from January 2022 to March 2023. The dataset includes information such as the dispatching base number, pickup datetime, drop-off datetime, pickup location ID, drop-off location ID, SR_Flag, and affiliated base number

In [1]:
import pandas as pd # Import the pandas library to work with data in a tabular format,
import glob # glob library to retrieve file paths using patterns
import tqdm # ibrary to display progress bars
import plotly.graph_objects as go # library to create interactive plots
from statsmodels.tsa.arima.model import ARIMA # class from statsmodels.tsa.arima.model to perform ARIMA modeling
from dateutil.relativedelta import relativedelta #  from dateutil.relativedelta to manipulate dates, numpy for numerical operations
import numpy as np # for numerical operations
from pmdarima import auto_arima # auto_arima from pmdarima library to automatically select the best ARIMA model

# Summary :
# Overall, this code imports the necessary libraries
# for time series analysis, including ARIMA modeling, 
# and utilizes the auto_arima function to automatically 
# select the best ARIMA model based on the data provided.
In [2]:
# Uses the glob.glob function to retrieve a list of file paths that match the specified 
# pattern 'Datasets/fhv_tripdata_2022-2023_in_csv/*.csv'. 
# This pattern is used to find all CSV files in the given directory.
data_list_path = glob.glob('Datasets/fhv_tripdata_2022-2023_in_csv/*.csv')

# Initializes an empty list called list_df to store the DataFrames.
list_df = []
# terates over each file path in data_list_path
for path in data_list_path:
    print(path)
    # Step 1: Preprocess the Dataset
    # inside the loop, it reads each CSV file using pd.read_csv and assigns it to the variable df.
    df = pd.read_csv(path)
    # Appends the DataFrame to the list_df list.
    list_df.append(df)
    
# After the loop, it concatenates all the DataFrames in list_df into a single DataFrame using pd.concat. 
# The concatenated DataFrame is assigned to the variable df
df =  pd.concat(list_df)

# Specifies a list of column names ('pickup_datetime' and 'PUlocationID') 
# in interested_features that you are interested in keeping
interested_features = ['pickup_datetime','PUlocationID']
# Updates df to contain only the columns specified in interested_features using indexing.
df = df[interested_features]


# Summary :
# Overall, this code reads multiple CSV files from the specified directory, 
# concatenates them into a single DataFrame, and then selects and keeps only the columns specified in interested_features
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-09.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-02.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-04.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-07.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-06.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-08.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-03.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-11.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-12.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-02.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-03.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-01.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-05.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-10.csv
In [3]:
# The code imports the necessary libraries:
import pandas as pd
import pmdarima as pm
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

# Prints the number of rows in the DataFrame df before removing rows with NaN values
# This line uses the .shape[0] attribute of a DataFrame to retrieve the number of rows.
print('Number of Rows Before Removing NaN:', df.shape[0])
# Removes rows with NaN values from the DataFrame df and assigns the result to removed_nan_df:
removed_nan_df = df.dropna()
#The .dropna() method is used to remove rows containing any NaN values. 
#The resulting DataFrame with NaN rows removed is assigned to removed_nan_df/
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])
Number of Rows Before Removing NaN: 17712727
Number of Rows After Removing NaN: 4164902
In [4]:
from prophet import Prophet
import os

print('Number of Rows Before Removing NaN:', df.shape[0])
removed_nan_df = df.dropna()
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])

location_ids = removed_nan_df['PUlocationID'].unique().tolist()

for lc_id in location_ids:
    print('Location ID:', lc_id)
    df_subset = removed_nan_df[removed_nan_df['PUlocationID'] == lc_id]
    df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime'])
    df_subset = df_subset.sort_values('pickup_datetime')
    df_subset = df_subset.set_index('pickup_datetime')
    df_subset = df_subset['PUlocationID'].resample('1H').count()
    df_subset = df_subset.reset_index()

    # Split data into training and testing sets
    train_size = int(len(df_subset) * 0.95)
    train_data = df_subset[:train_size]
    test_data = df_subset[train_size:]

    # Prepare data for Prophet model
    prophet_train_data = train_data.rename(columns={'pickup_datetime': 'ds', 'PUlocationID': 'y'})

    # Create and fit the Prophet model
    model = Prophet(
        seasonality_mode='additive',
        daily_seasonality=True,  # Disable daily seasonality
        weekly_seasonality=True,  # Enable weekly seasonality
        yearly_seasonality=False,  # Disable yearly seasonality
    )
    model.fit(prophet_train_data)

    # Generate future dates for prediction
    future_dates = model.make_future_dataframe(periods=len(test_data), freq='H')

    # Make predictions
    forecast = model.predict(future_dates)
    forecast = forecast[['ds', 'yhat']][-len(test_data):]

    # Create a new dataframe to store actual and predicted values
    result_df = pd.DataFrame({
        'Actual': test_data['PUlocationID'].values,
        'Prediction': forecast['yhat'].values
    })

    # Save the dataframe as a CSV file with the location ID as the filename
    filename = f'prophet-results/{lc_id}.csv'
    result_df.to_csv(filename, index=False)

    # Plotting
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=prophet_train_data['ds'], y=prophet_train_data['y'], mode='lines+markers', name='Training Data'))
    fig.add_trace(go.Scatter(x=test_data['pickup_datetime'], y=test_data['PUlocationID'], mode='lines+markers', name='Testing Data'))
    fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='lines+markers', name='Prophet Forecast'))
    fig.update_layout(title=f'PickLocation ID: {lc_id} - Facebook Prophet', xaxis_title='Time', yaxis_title='Number Drives')
    fig.show()
    break
/home/iffi/anaconda3/envs/sep_darts_2/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Number of Rows Before Removing NaN: 17712727
Number of Rows After Removing NaN: 4164902
Location ID: 12.0
/tmp/ipykernel_10821/1487895832.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime'])
20:10:14 - cmdstanpy - INFO - Chain [1] start processing
20:10:18 - cmdstanpy - INFO - Chain [1] done processing